##### Biterm Topic Model        #----------------------------------------------#

# This script runs the biterm topic modes based on the lemmatized data from ACLED.
# It also generates the content for Table 1.

#------------- Run BTM --------------------------------------------

# Load tokens data
load("output/acled_tokens_final.Rda")

# Set random seed to ensure replicability
set.seed(06032022)

# run biterm topics model
BTM_acled <- BTM::BTM(acled_dat, k = 10, beta = 0.05, alpha = 0.1,
                      iter = 750, trace = TRUE, background = T,
                      detailed = T)

# Save the model's log Likelihood (takes a while)
#fit <- logLik(BTM_acled)
#fit$ll

#------------- Table 1 --------------------------------------------


# find top 15 terms per topic
topicterms <- terms(BTM_acled, top_n = 15)
topicterms

# predict topic probabilities and save as dataframe, make sure that the x_sub 
# data frame is loaded.
scores_pred <- predict(BTM_acled, newdata = x_sub, type = "sum_b")

scores <- as.data.frame(scores_pred) %>% 
  mutate(data_id = row.names(scores_pred)) %>% 
  select(data_id, everything())

# Extract top words per topic in separate data frame
df1 <- lapply(topicterms, `[[`, 1)   
df_topics <- data.frame(matrix(unlist(df1), nrow = 15, byrow=F))

# name topics based on common terms
topicNames_BTM <- apply(df_topics[1:3,], 2, paste0, collapse = "_") 

# define old variable names
old <- colnames(scores[2:11])

# rename topics from numerical values to terms
scores_BTM <- scores  %>% 
  rename_at(all_of(old), ~ topicNames_BTM)

# get average topic prevalence
summary(scores_BTM)

# merge acled events and topics and merge topics
acled_btm <- left_join(corona_protests,  scores, by=c("data_id")) %>% 
  filter(year > 2019 & year < 2023) %>% 
  mutate(health_care = V3,
         imprisonment_crime = V6,
         restrictions_masks = V4,
         vaccination = V5,
         mishandling_corruption = V10,
         business_restrictions = V2 ,
         economic_consequences = V8,
         education = V9,
  ) %>% 
  select(-(V1:V10)) %>% 
  mutate(cowcode = countrycode::countrycode(country, "country.name", "cown")) %>% 
  drop_na(cowcode)

# calculate most likely topic per unit
acled_btm$max_topic <- names(acled_btm[10:15])[max.col(replace(acled_btm[10:15], is.na(acled_btm[10:15]), -999), 'random')]

save(acled_btm, file = "output/acled_btm.Rda")

#------------- Examine coherence using Quanteda (Figure A.2) --------------------------------------------

# There are no functions in the BTM package to calculate topic coherence, 
# therefore, I use textmineR to do that

# format to document feature matrix
dfmat_acled <- dfm(toks_nopunct)
print(dfmat_acled)

# remove very frequent and infrequent words
dfmat_acled_trim <- dfm_trim(dfmat_acled, min_termfreq = 10,
                             max_docfreq = .1,
                             docfreq_type = "prop",
                             verbose = T)
# summary
print(dfmat_acled_trim)

# top features
topfeatures(dfmat_acled_trim, 100)


dtm2matrix <- as(dfmat_acled_trim, "dgCMatrix")

# choose number of topics
k_list <- seq(5,50, by = 2)


model_list <- TmParallelApply(X = k_list, FUN = function(k){
  
  m <- FitLdaModel(dtm = dtm2matrix, 
                   k = k, 
                   iterations = 500, 
                   burnin = 100,
                   alpha = 0.1,
                   beta = .05,
#                   optimize_alpha = TRUE,
                   calc_likelihood = TRUE,
                   calc_coherence = TRUE,
                   calc_r2 = TRUE)
  m$k <- k
  
  m
}, export= ls(), 
cpus = 3) 

# Get average coherence for each model
coherence_mat <- data.frame(k = sapply(model_list, function(x) nrow(x$phi)), 
                            coherence = sapply(model_list, function(x) mean(x$coherence)), 
                            stringsAsFactors = FALSE)


# Plot the result
pdf("output/Figure_A2.pdf", width = 7, height = 5)
plot(coherence_mat, type = "o")
dev.off()
